In [ ]:
!pip install folium
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: folium in /usr/local/lib/python3.10/dist-packages (0.14.0)
Requirement already satisfied: branca>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from folium) (0.6.0)
Requirement already satisfied: jinja2>=2.9 in /usr/local/lib/python3.10/dist-packages (from folium) (3.1.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from folium) (1.22.4)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from folium) (2.27.1)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.9->folium) (2.1.2)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (1.26.15)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (2022.12.7)
Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (2.0.12)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (3.4)
In [117]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import folium
from folium.plugins import HeatMap
In [121]:
from google.colab import files
uploaded = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving bus_df1.csv to bus_df1.csv
In [157]:
# df_bus = pd.read_csv('bus_df.csv')
df_bus = pd.read_csv('bus_df1.csv')
In [158]:
df_bus.isna().sum()
Out[158]:
business_id         0
name                0
address          5127
city                0
state               0
postal_code        73
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      13744
categories        103
hours           23223
dtype: int64
In [159]:
df_bus = df_bus.drop(['address', 'postal_code', 'hours'], axis=1)
df_bus = df_bus.dropna(axis=0)
df_bus['categories'] = df_bus['categories'].astype('object')
<ipython-input-159-c570a7e9626a>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bus['categories'] = df_bus['categories'].astype('object')
In [160]:
df_bus.head().T
Out[160]:
0 1 2 3 4
business_id Pns2l4eNsfO8kk83dixA6A mpf3x-BjTdTEA3yCZrAYPw tUFrWirKiKi_TAnsVWINQQ MTSW4McQd7CbVtyjqoe9mw mWMc6_wTdE0EUBKIGXDVfA
name Abby Rappoport, LAC, CMQ The UPS Store Target St Honore Pastries Perkiomen Valley Brewery
city Santa Barbara Affton Tucson Philadelphia Green Lane
state CA MO AZ PA PA
latitude 34.426679 38.551126 32.223236 39.955505 40.338183
longitude -119.711197 -90.335695 -110.880452 -75.155564 -75.471659
stars 5.0 3.0 3.5 4.0 4.5
review_count 7 15 22 80 13
is_open 0 1 0 1 1
attributes {'ByAppointmentOnly': 'True'} {'BusinessAcceptsCreditCards': 'True'} {'BikeParking': 'True', 'BusinessAcceptsCredit... {'RestaurantsDelivery': 'False', 'OutdoorSeati... {'BusinessAcceptsCreditCards': 'True', 'Wheelc...
categories Doctors, Traditional Chinese Medicine, Naturop... Shipping Centers, Local Services, Notaries, Ma... Department Stores, Shopping, Fashion, Home & G... Restaurants, Food, Bubble Tea, Coffee & Tea, B... Brewpubs, Breweries, Food
In [161]:
df_bus.describe()
Out[161]:
latitude longitude stars review_count is_open
count 136601.000000 136601.000000 136601.000000 136601.000000 136601.000000
mean 36.675350 -89.271785 3.623319 48.083623 0.785858
std 5.850294 14.855670 0.943832 126.477539 0.410227
min 27.555127 -120.095137 1.000000 5.000000 0.000000
25% 32.192213 -90.348605 3.000000 8.000000 1.000000
50% 38.778279 -86.120708 4.000000 16.000000 1.000000
75% 39.953936 -75.409757 4.500000 41.000000 1.000000
max 53.679197 -73.200457 5.000000 7568.000000 1.000000
In [162]:
df_bus.describe(include='object')
Out[162]:
business_id name city state attributes categories
count 136601 136601 136601 136601 136601 136601
unique 136601 103432 1347 27 87661 78614
top Pns2l4eNsfO8kk83dixA6A Starbucks Philadelphia PA {'BusinessAcceptsCreditCards': 'True'} Beauty & Spas, Nail Salons
freq 1 714 13399 31080 9385 950
In [163]:
#Heat map of all the restaurants
map = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
heat_data = [[row['latitude'],row['longitude']] for index, row in df_bus.iterrows()]
HeatMap(heat_data).add_to(map)
map
Out[163]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [164]:
df_open = df_bus[df_bus['is_open'] == 1]
df_closed = df_bus[df_bus['is_open'] == 0]
In [165]:
#Heatmap of open restaurants
map = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
heat_data = [[row['latitude'],row['longitude']] for index, row in df_open.iterrows()]
HeatMap(heat_data).add_to(map)
map
Out[165]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [166]:
#Heat map of closed restaurants
map = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
heat_data = [[row['latitude'],row['longitude']] for index, row in df_closed.iterrows()]
HeatMap(heat_data).add_to(map)
map
Out[166]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [167]:
print("Count of restaurants that are open {}".format(len(df_open)))
print("Count of restaurants that are closed {}".format(len(df_closed)))
sns.barplot(x=['Open Restaurant', 'Closed Restaurant'], y=[len(df_open), len(df_closed)])
Count of restaurants that are open 107349
Count of restaurants that are closed 29252
Out[167]:
<Axes: >
In [168]:
# df = df_bus.groupby(['city', 'is_open']).count()
# df.reset_index(inplace=True)
# df.head()
In [169]:
#Getting the ratio of open restaurants over total restaurants by each state
df = df_bus.groupby('state')['is_open'].sum()/df_bus['state'].value_counts()
df = df.reset_index()
df = df.rename(columns={'index': 'state', 0: 'ratio'})
In [170]:
url = ("https://raw.githubusercontent.com/python-visualization/folium/main/examples/data")
state_geo = f"{url}/us-states.json"
ratio_open_map = folium.Map(location=[48, -102], zoom_start=4)

bins = list(df["ratio"].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]))

folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=df,
    columns=["state", 'ratio'],
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    bins = bins,
    legend_name="Open Restuarants VS total restuarants (%)",
).add_to(ratio_open_map)

folium.LayerControl().add_to(ratio_open_map)
ratio_open_map
Out[170]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [171]:
df_bus.head().T
Out[171]:
0 1 2 3 4
business_id Pns2l4eNsfO8kk83dixA6A mpf3x-BjTdTEA3yCZrAYPw tUFrWirKiKi_TAnsVWINQQ MTSW4McQd7CbVtyjqoe9mw mWMc6_wTdE0EUBKIGXDVfA
name Abby Rappoport, LAC, CMQ The UPS Store Target St Honore Pastries Perkiomen Valley Brewery
city Santa Barbara Affton Tucson Philadelphia Green Lane
state CA MO AZ PA PA
latitude 34.426679 38.551126 32.223236 39.955505 40.338183
longitude -119.711197 -90.335695 -110.880452 -75.155564 -75.471659
stars 5.0 3.0 3.5 4.0 4.5
review_count 7 15 22 80 13
is_open 0 1 0 1 1
attributes {'ByAppointmentOnly': 'True'} {'BusinessAcceptsCreditCards': 'True'} {'BikeParking': 'True', 'BusinessAcceptsCredit... {'RestaurantsDelivery': 'False', 'OutdoorSeati... {'BusinessAcceptsCreditCards': 'True', 'Wheelc...
categories Doctors, Traditional Chinese Medicine, Naturop... Shipping Centers, Local Services, Notaries, Ma... Department Stores, Shopping, Fashion, Home & G... Restaurants, Food, Bubble Tea, Coffee & Tea, B... Brewpubs, Breweries, Food
In [172]:
#The data is not well organized, Some categories contain 'restaurants, pizza' other contains 'pizza, restaurants' and they are not treated the same 
df = df_bus.categories.unique()
len(df)
Out[172]:
78614
In [173]:
df_bus.categories.dtype
Out[173]:
dtype('O')
In [174]:
unique_restaurants_list = []
unique_restaurants_list_count = []
for row in df:
    if "," in row:
        words_list = list(row.split(","))
    else:
         words_list = list(row)
    for word in words_list:
        word = word.strip()
        word = word.lower()
        try:
            index = unique_restaurants_list.index(word)
        except:
            index = -1

        if index == -1:
            unique_restaurants_list.append(word)
            unique_restaurants_list_count.append(1)
        else:
            unique_restaurants_list_count[index] += 1
In [175]:
df = pd.DataFrame({"Category": unique_restaurants_list,"Count":unique_restaurants_list_count})
In [176]:
#Checking if the incorrect written categories have an effect on the data by checking their count
df[df['Category'].apply(lambda x: len(x) < 2)]
Out[176]:
Category Count
469 l 13
470 o 11
471 c 9
472 a 15
473 18
474 s 15
475 e 26
476 r 11
477 v 10
478 i 14
531 t 16
532 f 4
763 h 6
764 m 5
855 u 4
856 n 12
902 d 2
933 g 4
934 b 2
935 y 1
936 & 6
937 p 5
In [177]:
df = df.sort_values(by='Count', ascending=False)
print(len(df))
df.head()
1326
Out[177]:
Category Count
17 restaurants 31389
18 food 19315
12 shopping 17654
36 nightlife 10452
34 bars 9492
In [178]:
#Extracting top 20 categories to mark them as the most popular categories
df = df.iloc[2:]
df = df.head(20)
df.head()
Out[178]:
Category Count
12 shopping 17654
36 nightlife 10452
34 bars 9492
154 home services 8486
7 local services 7572
In [179]:
# Define the number of rows to display in each chart
rows_per_chart = 10

# Calculate the number of charts needed
num_charts = -(-len(df) // rows_per_chart)  # Equivalent to math.ceil(len(df) / rows_per_chart)

# Define the number of rows and columns for the subplots
nrows = 1
ncols = num_charts

# Create the subplots
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5))

for i, ax in enumerate(axes):
    # Select the data for the current chart
    chart_data = df.iloc[i * rows_per_chart:(i + 1) * rows_per_chart]

    # Create a bar chart using Seaborn on the current subplot
    sns.barplot(y='Category', x='Count', data=chart_data, ax=ax)

    # Set the chart title
    ax.set_title(f'Chart {i + 1}')

# Adjust the layout
plt.tight_layout()

# Show the charts
plt.show()
In [180]:
#create a column for popular categories
df_bus['popular_category'] = df_bus['categories'].apply(lambda x: 1 if any(substring.lower() in x.lower() for substring in df['Category']) else 0)
df_bus.head()
Out[180]:
business_id name city state latitude longitude stars review_count is_open attributes categories popular_category
0 Pns2l4eNsfO8kk83dixA6A Abby Rappoport, LAC, CMQ Santa Barbara CA 34.426679 -119.711197 5.0 7 0 {'ByAppointmentOnly': 'True'} Doctors, Traditional Chinese Medicine, Naturop... 1
1 mpf3x-BjTdTEA3yCZrAYPw The UPS Store Affton MO 38.551126 -90.335695 3.0 15 1 {'BusinessAcceptsCreditCards': 'True'} Shipping Centers, Local Services, Notaries, Ma... 1
2 tUFrWirKiKi_TAnsVWINQQ Target Tucson AZ 32.223236 -110.880452 3.5 22 0 {'BikeParking': 'True', 'BusinessAcceptsCredit... Department Stores, Shopping, Fashion, Home & G... 1
3 MTSW4McQd7CbVtyjqoe9mw St Honore Pastries Philadelphia PA 39.955505 -75.155564 4.0 80 1 {'RestaurantsDelivery': 'False', 'OutdoorSeati... Restaurants, Food, Bubble Tea, Coffee & Tea, B... 1
4 mWMc6_wTdE0EUBKIGXDVfA Perkiomen Valley Brewery Green Lane PA 40.338183 -75.471659 4.5 13 1 {'BusinessAcceptsCreditCards': 'True', 'Wheelc... Brewpubs, Breweries, Food 0
In [181]:
#Ratio of places that has categories within the top 20 most popular catgories
df_bus['popular_category'].sum()/len(df_bus)
Out[181]:
0.823244339353299
In [182]:
df = pd.DataFrame(df_bus.groupby(['is_open','popular_category'])['popular_category'].agg('count'))
df = df.rename(columns={'popular_category': 'Category_count'})
df = df.reset_index()
In [183]:
df
Out[183]:
is_open popular_category Category_count
0 0 0 8097
1 0 1 21155
2 1 0 16048
3 1 1 91301
In [184]:
#barplot between is it open or not based on ctegory
sns.barplot(x='is_open', y='Category_count', hue='popular_category', data=df)
Out[184]:
<Axes: xlabel='is_open', ylabel='Category_count'>
In [185]:
df_bus.iloc[0].attributes
df_bus.attributes.unique()
Out[185]:
array(["{'ByAppointmentOnly': 'True'}",
       "{'BusinessAcceptsCreditCards': 'True'}",
       '{\'BikeParking\': \'True\', \'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsPriceRange2\': \'2\', \'CoatCheck\': \'False\', \'RestaurantsTakeOut\': \'False\', \'RestaurantsDelivery\': \'False\', \'Caters\': \'False\', \'WiFi\': "u\'no\'", \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'WheelchairAccessible\': \'True\', \'HappyHour\': \'False\', \'OutdoorSeating\': \'False\', \'HasTV\': \'False\', \'RestaurantsReservations\': \'False\', \'DogsAllowed\': \'False\', \'ByAppointmentOnly\': \'False\'}',
       ...,
       '{\'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsPriceRange2\': \'2\', \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'BikeParking\': \'True\', \'WiFi\': "u\'no\'"}',
       '{\'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'BikeParking\': \'True\', \'RestaurantsPriceRange2\': \'4\', \'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsTakeOut\': \'None\', \'RestaurantsDelivery\': \'None\'}',
       '{\'WheelchairAccessible\': \'True\', \'BusinessAcceptsBitcoin\': \'False\', \'RestaurantsPriceRange2\': \'1\', \'BusinessAcceptsCreditCards\': \'True\', \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'BikeParking\': \'False\', \'WiFi\': "u\'free\'", \'ByAppointmentOnly\': \'False\'}'],
      dtype=object)
In [186]:
#extracting features
import ast
unique_values = []
for index, row in df_bus.iterrows():
    dictionary = ast.literal_eval(row.attributes)
    for (key, value) in dictionary.items():
        key = key.lower()
        
        if (value == 'True'):
            if key not in unique_values:
                unique_values.append(key)
                df_bus[key]= 0
            df_bus.loc[index,key]=1

        elif (value == 'False' or value == 'None'):
            if key not in unique_values:
                unique_values.append(key)
                df_bus[key]= 0
        else:
            try:
                dictionary1 = ast.literal_eval(value)
                for (key1, value1) in dictionary1.items():
                    key1 = key1.lower()
                    if (value == 'True'):
                        if key1 not in unique_values:
                            unique_values.append(key1)
                            df_bus[key1]= 0
                        df_bus.loc[index,key1]=1
                    elif (value == 'False' or value == 'None'):
                        if key1 not in unique_values:
                            unique_values.append(key1)
                            df_bus[key1]= 0
            except:
                continue
In [187]:
df_bus.head()
Out[187]:
business_id name city state latitude longitude stars review_count is_open attributes ... noiselevel open24hours restaurantscounterservice bestnights hairspecializesin music alcohol restaurantsattire byobcorkage dietaryrestrictions
0 Pns2l4eNsfO8kk83dixA6A Abby Rappoport, LAC, CMQ Santa Barbara CA 34.426679 -119.711197 5.0 7 0 {'ByAppointmentOnly': 'True'} ... 0 0 0 0 0 0 0 0 0 0
1 mpf3x-BjTdTEA3yCZrAYPw The UPS Store Affton MO 38.551126 -90.335695 3.0 15 1 {'BusinessAcceptsCreditCards': 'True'} ... 0 0 0 0 0 0 0 0 0 0
2 tUFrWirKiKi_TAnsVWINQQ Target Tucson AZ 32.223236 -110.880452 3.5 22 0 {'BikeParking': 'True', 'BusinessAcceptsCredit... ... 0 0 0 0 0 0 0 0 0 0
3 MTSW4McQd7CbVtyjqoe9mw St Honore Pastries Philadelphia PA 39.955505 -75.155564 4.0 80 1 {'RestaurantsDelivery': 'False', 'OutdoorSeati... ... 0 0 0 0 0 0 0 0 0 0
4 mWMc6_wTdE0EUBKIGXDVfA Perkiomen Valley Brewery Green Lane PA 40.338183 -75.471659 4.5 13 1 {'BusinessAcceptsCreditCards': 'True', 'Wheelc... ... 0 0 0 0 0 0 0 0 0 0

5 rows × 50 columns

In [188]:
df = df_bus.groupby('is_open')['stars'].mean()
df = df.reset_index()
# Create batchart
sns.set_style("whitegrid")
sns.set_palette("Set2")

sns.barplot(x='is_open', y='stars', data=df)
Out[188]:
<Axes: xlabel='is_open', ylabel='stars'>
In [189]:
df = df_bus.drop(['business_id', 'name', 'city', 'state',  'latitude', 'longitude', 'attributes', 'categories'], axis=1)
In [190]:
for column in df.columns:
    if (df[column].sum()==0):
        df = df.drop([column], axis=1)
In [191]:
# Calculate the correlation matrix
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
# Create a correlation heatmap using Seaborn
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')

# Set the title of the heatmap
plt.title('Correlation Heatmap')

# Display the heatmap
plt.show()
In [191]:
 
In [ ]: